Data engineer challenge¶
In this notebook, I perform an exploratory analysis to uncover possible problems with the update of the next_payment_day column in the allowance_backend_table. I start by building dataframes with cleaned versions of the allowance_backend_table data and the events, called cleaned_backend_table and cleaned_events:
# Notebook setup: Spark session, plotly rendering and the analysis date.
import pyspark
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
spark = pyspark.sql.SparkSession.builder.appName("inconsistences_report").getOrCreate()
# Silence Spark's INFO/WARN chatter in the notebook output.
spark.sparkContext.setLogLevel("ERROR")
# Reference "today" used throughout the analysis (the data snapshot date).
CURRENT_DAY = "2024-12-03"
import pyspark.sql.functions as spark_f
def flatten_column(df, column):
    """Unnest a struct column into prefixed top-level columns.

    Every nested field ``column.sub`` is promoted to a new column named
    ``column_sub``; the original struct column is dropped at the end.
    """
    for field in df.select(f"{column}.*").columns:
        df = df.withColumn(f"{column}_{field}", spark_f.col(f"{column}.{field}"))
    return df.drop(column)
def histogram(df, column, title):
    """Render a plotly histogram of ``column`` with the given title."""
    import plotly.express as px

    px.histogram(df, x=column, title=title).show()
def count_by_category(df, column):
    """Helper: per-category row counts of a pandas dataframe.

    Returns a dataframe with two columns, ``column`` and ``count``,
    sorted by the category values.
    """
    # groupby(...).size() counts rows directly.  The original
    # groupby(...).count() + rename(df.columns[0] -> "count") silently
    # failed when ``column`` was the dataframe's first column (the grouped
    # column moves into the index, so the rename matched nothing) and
    # undercounted groups when the first column contained NaNs.
    return (
        df.groupby(column)
        .size()
        .rename("count")
        .reset_index()
        .sort_values(column)
    )


def pie_chart(df, column, title, category_orders=None):
    """Function to generate a pie chart using plotly, the df argument needs to be a pandas dataframe"""
    import plotly.express as px

    counts = count_by_category(df, column)
    fig = px.pie(
        counts,
        names=column,
        values="count",
        category_orders=category_orders,
        title=title,
    )
    fig.show()
Raw data¶
# Load the three raw inputs.  CSV columns are all read as strings; the
# events file is multi-line JSON with nested objects.
raw_allowance_backend_table = spark.read.csv(
    "../data/allowance_backend_table.csv", header=True
)
raw_allowance_events = spark.read.json("../data/allowance_events.json", multiLine=True)
raw_payments_schedule_backend_table = spark.read.csv(
    "../data/payments_schedule_backend_table.csv", header=True
)
Cleaned events¶
# Flatten the nested event payload: each struct column ("allowance",
# "allowance_scheduled", "event", "user") becomes a set of prefixed
# top-level columns.
allowance_events = flatten_column(raw_allowance_events, "allowance")
allowance_events = flatten_column(allowance_events, "allowance_scheduled")
allowance_events = flatten_column(allowance_events, "event")
allowance_events = flatten_column(allowance_events, "user")
print(allowance_events.dtypes)
# Cast the string timestamp to a real timestamp and drop exact-duplicate rows.
cleaned_events = allowance_events.withColumn(
    "event_timestamp", allowance_events.event_timestamp.cast("timestamp")
).alias("cleaned_events")
cleaned_events = cleaned_events.dropDuplicates()
cleaned_events.show()
# payments_schedule_backend_table.show()
[('allowance_amount', 'bigint'), ('allowance_scheduled_day', 'string'), ('allowance_scheduled_frequency', 'string'), ('event_name', 'string'), ('event_timestamp', 'string'), ('user_id', 'string')]
+----------------+-----------------------+-----------------------------+-----------------+-------------------+--------------------+
|allowance_amount|allowance_scheduled_day|allowance_scheduled_frequency| event_name| event_timestamp| user_id|
+----------------+-----------------------+-----------------------------+-----------------+-------------------+--------------------+
| 20| saturday| weekly|allowance.created|2024-11-28 07:10:16|38f8d838-ea08-4fd...|
| 5| fifteenth_day| monthly|allowance.created|2024-09-05 13:30:38|9748cdad-69a4-400...|
| 5| saturday| weekly|allowance.created|2024-10-21 17:27:50|8252c070-2698-49d...|
| 3| monday| biweekly| allowance.edited|2024-11-06 12:53:31|3c700c10-3d42-49a...|
| 10| friday| biweekly|allowance.created|2024-11-23 07:56:18|d6331731-dafe-4bb...|
| 20| daily| daily| allowance.edited|2024-10-14 17:33:57|d299d804-0d05-48c...|
| 10| fifteenth_day| monthly| allowance.edited|2024-10-15 18:29:55|d299d804-0d05-48c...|
| 20| first_day| monthly| allowance.edited|2024-09-21 17:02:11|113d8189-3ce0-47b...|
| 1| monday| biweekly| allowance.edited|2024-10-15 10:02:59|c3e2b88d-75fc-449...|
| 5| friday| weekly| allowance.edited|2024-08-06 14:03:58|ba179220-3b79-429...|
| 15| sunday| weekly| allowance.edited|2024-08-10 07:21:42|8c212fdf-895b-4d3...|
| 4| monday| biweekly|allowance.created|2024-09-07 08:14:12|138a678d-b0eb-46a...|
| 10| friday| weekly| allowance.edited|2024-08-08 08:12:00|03bc2115-7d48-4be...|
| 5| first_day| monthly| allowance.edited|2024-11-10 07:28:22|e48bb548-a87a-406...|
| 20| saturday| weekly|allowance.created|2024-08-26 17:05:52|3f046523-49b3-4ef...|
| 10| wednesday| biweekly|allowance.created|2024-10-19 05:03:41|9af3c140-86b0-403...|
| 10| fifteenth_day| monthly| allowance.edited|2024-09-06 17:54:33|63a03a70-da6b-452...|
| 20| sunday| weekly| allowance.edited|2024-11-15 09:12:13|62023832-c468-45f...|
| 200| friday| weekly| allowance.edited|2024-10-31 03:53:41|560c5cf4-5fd8-447...|
| 10| friday| weekly|allowance.created|2024-10-22 13:59:37|61609771-d5b0-46d...|
+----------------+-----------------------+-----------------------------+-----------------+-------------------+--------------------+
only showing top 20 rows
Cleaned backend table¶
Doing some cleaning and selecting only the enabled allowances to build the cleaned_backend_table:
# Align the key column name with the events data ("uuid" -> "user_id").
allowance_backend_table = raw_allowance_backend_table.withColumnRenamed(
    "uuid", "user_id"
)
# to see all possible values of column status
allowance_backend_table.select("status").dropDuplicates().show()
# get only the enabled rows
allowance_backend_table = allowance_backend_table.filter(
    allowance_backend_table.status == "enabled"
)
# updated_at arrives in two formats: ISO-like strings
# ("YYYY-MM-DDTHH:MM:SS.ffffffZ") and unix epoch seconds.  Rows matching
# the ISO pattern are parsed with to_timestamp, the rest via from_unixtime.
# NOTE(review): the '.' before the fractional part is unescaped in the
# regex, so it matches any character there — harmless for this data, but
# worth tightening to '\.'.
allowance_backend_table = allowance_backend_table.withColumn(
    "corrected_updated_at",
    spark_f.when(
        spark_f.rlike(
            allowance_backend_table.updated_at,
            spark_f.lit(r"\d+-\d+-\d+T\d+:\d+:\d+.\d+Z"),
        ),
        spark_f.to_timestamp(allowance_backend_table.updated_at),
    )
    .otherwise(spark_f.from_unixtime(allowance_backend_table.updated_at))
    .cast("timestamp"),
)
# creation_date is stored as unix epoch seconds.
allowance_backend_table = allowance_backend_table.withColumn(
    "creation_date",
    spark_f.from_unixtime(allowance_backend_table.creation_date).cast("timestamp"),
)
# Final cleaned view: the corrected timestamp replaces the raw updated_at.
cleaned_backend_table = (
    allowance_backend_table.drop("updated_at")
    .withColumnRenamed("corrected_updated_at", "updated_at")
    .alias("cleaned_backend_table")
)
print(cleaned_backend_table.dtypes)
cleaned_backend_table.show(truncate=False)
+--------+
| status|
+--------+
| enabled|
|disabled|
+--------+
[('user_id', 'string'), ('creation_date', 'timestamp'), ('frequency', 'string'), ('day', 'string'), ('next_payment_day', 'string'), ('status', 'string'), ('updated_at', 'timestamp')]
+------------------------------------+-------------------+---------+-------------+----------------+-------+--------------------------+
|user_id |creation_date |frequency|day |next_payment_day|status |updated_at |
+------------------------------------+-------------------+---------+-------------+----------------+-------+--------------------------+
|30f4e25e-3e37-462e-8c3c-42f24f54350f|2024-08-28 06:51:49|monthly |fifteenth_day|15 |enabled|2024-10-15 05:00:41.445627|
|6da398ad-079d-49b9-8668-6d7ce4d22683|2024-08-26 05:10:29|monthly |fifteenth_day|15 |enabled|2024-08-26 05:10:29 |
|2d30fe2d-6c32-4b8a-a19b-906184f64f62|2024-11-11 04:12:39|monthly |fifteenth_day|15 |enabled|2024-11-11 04:12:39 |
|c30180f3-b35c-470c-b25a-f12ec4ce3340|2024-09-03 22:56:22|monthly |first_day |1 |enabled|2024-10-01 05:00:37.516552|
|35d3d7c2-2a05-4eae-bdf2-7896e611bbc6|2024-09-04 09:42:52|biweekly |monday |16 |enabled|2024-09-04 09:42:52 |
|9295f1b0-c1cf-4dca-b720-84146395e058|2024-09-04 22:59:01|biweekly |monday |16 |enabled|2024-09-04 22:59:01 |
|c398bb9d-8ba5-4f31-9dd2-97d09a289360|2024-09-04 09:43:27|biweekly |monday |16 |enabled|2024-09-04 09:43:27 |
|1958d8e3-cb69-4190-a47e-58008d526fc8|2024-11-14 19:37:25|biweekly |tuesday |10 |enabled|2024-11-26 05:00:26.493628|
|919436cc-5f02-43d1-9636-d72e1490bb3d|2024-11-17 23:07:28|biweekly |monday |2 |enabled|2024-11-17 23:07:28 |
|6b36d785-5958-4469-a438-9b2e09cbf973|2024-11-21 17:50:19|weekly |thursday |5 |enabled|2024-11-28 05:00:26.049462|
|9097a093-d124-4cbd-b5bc-b27ef6d34f3d|2024-11-07 02:36:17|weekly |saturday |7 |enabled|2024-11-30 05:00:32.954405|
|a0e18e1a-7d45-4157-ab4a-425749d214bc|2024-11-28 05:46:33|monthly |first_day |1 |enabled|2024-12-01 05:01:43.373425|
|89fc688d-51ba-4877-92cc-6428cc4d2c4b|2024-10-27 08:20:51|biweekly |friday |6 |enabled|2024-11-22 05:00:41.160438|
|0e835ee0-7305-4890-bdfe-d80754312cd7|2024-11-20 20:45:51|biweekly |thursday |28 |enabled|2024-11-20 20:45:51 |
|7fb21ac5-9ef1-4a8d-ba9a-ec83c7ee14c9|2024-11-20 20:45:59|biweekly |thursday |28 |enabled|2024-11-20 20:45:59 |
|76a558c9-e51e-47a7-9891-dee6041e0bfd|2024-11-28 13:43:28|monthly |first_day |1 |enabled|2024-12-01 05:00:35.002747|
|6d125ef1-c556-41f3-ab86-911271613e08|2024-11-17 12:27:12|biweekly |friday |29 |enabled|2024-11-27 22:10:19.339488|
|83e1815d-b48f-47e6-89b6-9b52e5875c79|2024-09-27 13:33:44|monthly |first_day |1 |enabled|2024-11-09 18:13:51.566857|
|c97f1147-1f59-4a0d-a1e2-12c65b4fcd67|2024-11-27 16:40:29|weekly |thursday |5 |enabled|2024-11-28 05:00:29.438429|
|3cc7a3ee-4691-4f60-a549-268e813e2c22|2024-08-07 20:55:42|monthly |first_day |1 |enabled|2024-12-01 05:00:29.034582|
+------------------------------------+-------------------+---------+-------------+----------------+-------+--------------------------+
only showing top 20 rows
Truth backend table¶
Rebuilding what the allowance_backend_table should contain (the truth_backend_table dataframe) from cleaned_events, to compare it with the actual allowance_backend_table:
# We have events of creation and edition, so we need to get the most recent event for each user to get
# a snapshot of what the table allowance_backend_table should be
from pyspark.sql.window import Window  # NOTE(review): imported but unused
last_event_by_user = (
    cleaned_events.groupBy("user_id").agg(
        spark_f.max("event_timestamp").alias("event_timestamp")
    )
).alias("last_event_by_user")
# Join the per-user max timestamp back to the events to recover the full
# latest row; the event name is kept twice (as event_name and last_event_name).
truth_backend_table = last_event_by_user.join(
    cleaned_events, on=["user_id", "event_timestamp"]
).selectExpr("cleaned_events.*", "event_name as last_event_name")
### QUALITY TEST to see if we can have more than one created event per user
# If we have some user with more than one created event, the allowance_backend_table could have more than one row
# per user and this code to generate the truth table would not have the real truth (only the most
# recent created or updated allowance)
more_than_one_allowance = (
    cleaned_events.groupBy("user_id", "event_name")
    .count()
    .filter("count > 1 and event_name = 'allowance.created'")
    .count()
)
assert (
    more_than_one_allowance == 0
), f"we have {more_than_one_allowance} users with more than one allowance.created event"
### QUALITY TEST to see if we have more than one event per user
duplicated_users = (
    truth_backend_table.groupBy("user_id").count().filter("count > 1").count()
)
assert (
    duplicated_users == 0
), f"we have {duplicated_users} users with more than one line in truth_backend_table"
# Rename event columns to match the backend table's schema.
truth_backend_table = truth_backend_table.withColumnRenamed(
    "allowance_scheduled_frequency", "frequency"
)
truth_backend_table = truth_backend_table.withColumnRenamed(
    "allowance_scheduled_day", "day"
)
truth_backend_table = truth_backend_table.withColumnRenamed(
    "allowance_amount", "amount"
)
truth_backend_table = truth_backend_table.alias("truth_backend_table")
truth_backend_table.show()
+--------------------+-------------------+------+-------------+---------+-----------------+-----------------+ | user_id| event_timestamp|amount| day|frequency| event_name| last_event_name| +--------------------+-------------------+------+-------------+---------+-----------------+-----------------+ |38f8d838-ea08-4fd...|2024-11-28 07:10:16| 20| saturday| weekly|allowance.created|allowance.created| |113d8189-3ce0-47b...|2024-09-21 17:02:11| 20| first_day| monthly| allowance.edited| allowance.edited| |8c212fdf-895b-4d3...|2024-08-10 07:21:42| 15| sunday| weekly| allowance.edited| allowance.edited| |138a678d-b0eb-46a...|2024-09-07 08:14:12| 4| monday| biweekly|allowance.created|allowance.created| |03bc2115-7d48-4be...|2024-08-08 08:12:00| 10| friday| weekly| allowance.edited| allowance.edited| |e48bb548-a87a-406...|2024-11-10 07:28:22| 5| first_day| monthly| allowance.edited| allowance.edited| |9af3c140-86b0-403...|2024-10-19 05:03:41| 10| wednesday| biweekly|allowance.created|allowance.created| |62023832-c468-45f...|2024-11-15 09:12:13| 20| sunday| weekly| allowance.edited| allowance.edited| |61609771-d5b0-46d...|2024-10-22 13:59:37| 10| friday| weekly|allowance.created|allowance.created| |804517cc-bf10-464...|2024-09-10 06:08:11| 20|fifteenth_day| monthly|allowance.created|allowance.created| |4e86cb31-1384-49c...|2024-10-27 15:11:53| 2| monday| biweekly|allowance.created|allowance.created| |d0287dea-599e-44c...|2024-11-10 07:17:44| 5| first_day| monthly| allowance.edited| allowance.edited| |2f92c80d-687a-485...|2024-08-07 13:45:58| 10| thursday| weekly|allowance.created|allowance.created| |77cb5202-9e1f-463...|2024-09-19 17:34:33| 5| sunday| weekly|allowance.created|allowance.created| |9de6a60f-e75c-402...|2024-09-13 02:38:54| 10| friday| biweekly| allowance.edited| allowance.edited| |957e0f46-41b7-428...|2024-09-30 16:31:56| 20|fifteenth_day| monthly|allowance.created|allowance.created| |a50295d7-5c2b-41b...|2024-11-12 18:20:34| 5| monday| 
weekly|allowance.created|allowance.created| |c3e2b88d-75fc-449...|2024-10-15 10:03:00| 1| monday| biweekly| allowance.edited| allowance.edited| |b92cf73a-c48f-4a4...|2024-12-01 10:53:07| 5| saturday| weekly| allowance.edited| allowance.edited| |a47b61f0-bd9c-4b7...|2024-10-20 17:30:33| 2| monday| biweekly|allowance.created|allowance.created| +--------------------+-------------------+------+-------------+---------+-----------------+-----------------+ only showing top 20 rows
Analysing discrepancies between the events and the backend table¶
There are some allowances that have no events in the events table, as shown below:
### QUALITY TEST: According to the events table it should not be possible for a user to have more than one allowance
more_than_one_allowances = (
    cleaned_backend_table.groupBy("user_id").count().filter("count > 1").count()
)
assert more_than_one_allowances == 0
# Left join: every (enabled) backend row, matched when possible to the
# latest event for that user.  Unmatched rows have NULL truth columns.
comparisson_backend_table = cleaned_backend_table.join(
    truth_backend_table, on="user_id", how="left"
)
### These lines of the allowance_backend_table should not exist because there are no events to support them
# on the events table
comparisson_backend_table.filter("truth_backend_table.user_id is null").select(
    "cleaned_backend_table.*"
).show()
+--------------------+-------------------+---------+---------+----------------+-------+--------------------+ | user_id| creation_date|frequency| day|next_payment_day| status| updated_at| +--------------------+-------------------+---------+---------+----------------+-------+--------------------+ |bb950bcb-0760-417...|2024-09-20 23:53:12| weekly| friday| 27|enabled| 2024-09-20 23:53:12| |59a14e2a-27a1-4cb...|2024-11-23 18:50:05| monthly|first_day| 1|enabled|2024-12-01 05:00:...| |a616b4c6-482d-4e5...|2024-11-23 18:49:36| monthly|first_day| 1|enabled|2024-12-01 05:01:...| |a6ca6993-3b73-4a8...|2024-11-23 18:49:28| monthly|first_day| 1|enabled|2024-12-01 05:01:...| |cd2d5904-7ca0-4d0...|2024-11-23 18:49:47| monthly|first_day| 1|enabled| 2024-11-23 18:49:47| +--------------------+-------------------+---------+---------+----------------+-------+--------------------+
As shown below, there are allowances with different values for the day and frequency columns compared to the values in the last event for those allowances.
print("Users with discrepancies in the day column:")
comparisson_backend_table.filter(
"truth_backend_table.day != cleaned_backend_table.day"
).selectExpr(
"user_id", "cleaned_backend_table.day", "truth_backend_table.day as truth_day"
).show(
truncate=False
)
print("Users with discrepancies in the frequency column:")
comparisson_backend_table.filter(
"truth_backend_table.frequency != cleaned_backend_table.frequency"
).selectExpr(
"user_id",
"cleaned_backend_table.frequency",
"truth_backend_table.frequency as truth_frequency",
).show(
truncate=False
)
Users with discrepancies in the day column: +------------------------------------+---------+---------+ |user_id |day |truth_day| +------------------------------------+---------+---------+ |308b2b9c-d49d-4b2d-947c-5b2370da090f|first_day|sunday | +------------------------------------+---------+---------+ Users with discrepancies in the frequency column: +------------------------------------+---------+---------------+ |user_id |frequency|truth_frequency| +------------------------------------+---------+---------------+ |308b2b9c-d49d-4b2d-947c-5b2370da090f|monthly |weekly | +------------------------------------+---------+---------------+
Building the dataframes with the expected next_payment_day¶
def build_backend_truth_table(frequency, truth_function):
    """Builder function to build dataframes with the expected next_payment_day for some frequency.

    Reads the module-level ``comparisson_backend_table`` (backend rows
    left-joined to each user's latest event).

    Arguments:
        frequency: frequency to keep ('daily', 'weekly', 'biweekly' or
            'monthly'), filtered on the truth (events) side.
        truth_function: callable(day_column_name, event_timestamp_column_name)
            returning a Column with the expected day-of-month of the
            next payment.

    Returns a dataframe with the backend columns, the computed
    truth_next_payment_day, a 'correct'/'wrong' label and the numeric
    difference between the stored and the expected value.
    """
    return (
        comparisson_backend_table.filter(
            f"truth_backend_table.frequency = '{frequency}'"
        )
        .withColumn(
            "truth_next_payment_day",
            truth_function(
                "truth_backend_table.day", "truth_backend_table.event_timestamp"
            ),
        )
        .selectExpr(
            "user_id",
            "cleaned_backend_table.frequency",
            "cleaned_backend_table.day",
            "cleaned_backend_table.creation_date",
            "cleaned_backend_table.updated_at",
            "cleaned_backend_table.next_payment_day",
            "last_event_name",
            "event_timestamp as last_event_timestamp",
            "truth_next_payment_day",
        )
        .withColumn(
            "correct",
            spark_f.when(
                spark_f.col("truth_next_payment_day")
                == spark_f.col("next_payment_day"),
                spark_f.lit("correct"),
            ).otherwise(spark_f.lit("wrong")),
        )
        # NOTE: the column name keeps the original spelling "diference" —
        # downstream cells may reference it as-is.
        .withColumn(
            "diference_next_payment_day",
            spark_f.col("next_payment_day") - spark_f.col("truth_next_payment_day"),
        )
    )
Daily¶
## daily discrepancies
def next_payment_day_daily_frequency(day, event_timestamp, current_day=None):
    """Expected next_payment_day (day of month) for the daily frequency.

    ``day`` and ``event_timestamp`` are accepted only so the signature
    matches the other frequency functions; a daily allowance always pays
    on the day after the reference date.  ``current_day`` may name a
    column to use as the reference instead of the CURRENT_DAY constant.
    """
    reference = (
        spark_f.lit(CURRENT_DAY) if current_day is None else spark_f.col(current_day)
    )
    return spark_f.day(spark_f.date_add(reference, 1))
# Expected vs stored next_payment_day for the daily frequency.
backend_table_daily_truth = build_backend_truth_table(
    "daily", next_payment_day_daily_frequency
)
pd_comparisson_backend = backend_table_daily_truth.toPandas()
histogram(
    pd_comparisson_backend,
    "correct",
    "Correctness of the next_payment_day for daily frequency",
)
pie_chart(
    pd_comparisson_backend,
    "correct",
    "Correct and wrong cases for the daily frequency",
    category_orders={"correct": ["correct", "wrong"]},
)
Monthly¶
# Which schedule day names appear for the monthly frequency.
comparisson_backend_table.filter("truth_backend_table.frequency = 'monthly'").select(
    "cleaned_backend_table.day"
).drop_duplicates().show()
+-------------+ | day| +-------------+ |fifteenth_day| | first_day| +-------------+
# comparisson_backend_table.show()
def next_payment_day_montlhy_frequency(day_column, event_timestamp):
    """Expected next_payment_day for the monthly frequency.

    Maps the schedule day name to its day-of-month number:
    'fifteenth_day' -> 15, 'first_day' -> 1, anything else -> NULL.
    ``event_timestamp`` is unused; it only keeps the signature expected
    by build_backend_truth_table.
    """
    source = spark_f.col(day_column)
    return (
        spark_f.when(source == "fifteenth_day", spark_f.lit(15))
        .when(source == "first_day", spark_f.lit(1))
        .otherwise(spark_f.lit(None))
    )
# Expected vs stored next_payment_day for the monthly frequency.
backend_table_monthly_truth = build_backend_truth_table(
    "monthly", next_payment_day_montlhy_frequency
)
pd_comparisson_backend = backend_table_monthly_truth.toPandas()
histogram(
    pd_comparisson_backend,
    "correct",
    "Correctness of the next_payment_day for monthly frequency",
)
pie_chart(
    pd_comparisson_backend,
    "correct",
    "Correct and wrong cases for the monthly frequency",
    category_orders={"correct": ["correct", "wrong"]},
)
Weekly¶
# Which schedule day names appear for the weekly frequency.
comparisson_backend_table.filter("truth_backend_table.frequency = 'weekly'").select(
    "truth_backend_table.day"
).dropDuplicates().show()
+---------+ | day| +---------+ | sunday| | thursday| | monday| |wednesday| | saturday| | friday| | tuesday| +---------+
# comparisson_backend_table.show()
def int_day_of_week(column):
    """Map a weekday-name column to a 1-based day-of-week number.

    The numbering (sunday=1 ... saturday=7) lines up with Spark's
    ``dayofweek`` so the two can be compared directly.  Values that are
    not a recognised lowercase weekday name become NULL.

    Arguments:
        column: name of a string column holding lowercase weekday names.

    Returns an integer Column.
    """
    DAYS_OF_WEEK = [
        "sunday",
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
    ]
    # Build the SQL array literal outside the f-string: the original code
    # nested an f-string using the same quote character inside another
    # f-string, which is only valid syntax on Python 3.12+ (PEP 701).
    days_sql = ",".join(f"'{day}'" for day in DAYS_OF_WEEK)
    day_of_week = (
        spark_f.when(
            spark_f.col(column).isin(DAYS_OF_WEEK),
            spark_f.expr(f"array_position(array({days_sql}), {column})"),
        )
        .otherwise(spark_f.lit(None))
        .cast("int")
    )
    return day_of_week
def next_day_of_week(timestamp, day_of_week):
    """First date strictly after ``timestamp`` that falls on ``day_of_week``.

    ``day_of_week`` uses Spark's dayofweek numbering (sunday=1 ...
    saturday=7).  When ``timestamp`` already falls on the target weekday
    the result is one week later, never the same day.
    """
    offset = day_of_week - spark_f.dayofweek(timestamp)
    return spark_f.when(offset > 0, spark_f.date_add(timestamp, offset)).otherwise(
        spark_f.date_add(timestamp, offset + 7)
    )
def next_payment_day_weekly_frequency(day_column, event_timestamp, current_day=None):
    """Expected next_payment_day (day of month) for the weekly frequency.

    ``event_timestamp`` is unused; it keeps the signature expected by
    build_backend_truth_table.  ``current_day`` may name a column to use
    as the reference date instead of the CURRENT_DAY constant.
    """
    if current_day is None:
        reference = spark_f.to_date(spark_f.lit(CURRENT_DAY))
    else:
        reference = spark_f.to_date(current_day)
    target_weekday = int_day_of_week(day_column)
    return spark_f.day(next_day_of_week(reference, target_weekday))
# Expected vs stored next_payment_day for the weekly frequency.
backend_table_weekly_truth = build_backend_truth_table(
    "weekly", next_payment_day_weekly_frequency
)
# comparisson_backend_table.filter("truth_backend_table.frequency = 'weekly'").show()
pd_comparisson_backend = backend_table_weekly_truth.toPandas()
histogram(
    pd_comparisson_backend,
    "correct",
    "Correctness of the next_payment_day for weekly frequency",
)
pie_chart(
    pd_comparisson_backend,
    "correct",
    "Correct and wrong cases for the weekly frequency",
    category_orders={"correct": ["correct", "wrong"]},
)
Biweekly¶
def next_payment_day_biweekly_frequency(
    day_column, last_event_date_column, current_day=None
):
    """Expected next_payment_day (day of month) for biweekly allowances.

    Arguments:
        day_column: column name holding the weekday name of the payment.
        last_event_date_column: column with the date of the last
            create/edit event — it anchors the 14-day cycle.
        current_day: optional column name to use as "today"; defaults to
            the module-level CURRENT_DAY constant.
    """
    if current_day is None:
        current_day = spark_f.to_date(spark_f.lit(CURRENT_DAY))
    else:
        current_day = spark_f.to_date(spark_f.col(current_day))
    day_of_week = int_day_of_week(day_column)
    current_day_of_week = spark_f.dayofweek(current_day)  # NOTE(review): unused
    ## I'm considering that the first week of the payment is the first occurrence of the day
    # after or during the last edition day of the allowance
    first_payment_day = next_day_of_week(last_event_date_column, day_of_week)
    # Days elapsed since the cycle anchor, folded into the 14-day period.
    dif_days = spark_f.date_diff(current_day, first_payment_day)
    next_biweek_day = spark_f.date_add(current_day, 14 - (dif_days % (7 * 2)))
    # If the anchor itself is still in the future, that is the next payment.
    next_day = spark_f.when(
        first_payment_day > current_day, first_payment_day
    ).otherwise(next_biweek_day)
    return spark_f.day(next_day)
# Expected vs stored next_payment_day for the biweekly frequency.
backend_table_biweekly_truth = build_backend_truth_table(
    "biweekly", next_payment_day_biweekly_frequency
)
# comparisson_backend_table.filter("truth_backend_table.frequency = 'weekly'").show()
pd_comparisson_backend = backend_table_biweekly_truth.toPandas()
histogram(
    pd_comparisson_backend,
    "correct",
    "Correctness of the next_payment_day for biweekly frequency",
)
pie_chart(
    pd_comparisson_backend,
    "correct",
    "Correct and wrong cases for the biweekly frequency",
    category_orders={"correct": ["correct", "wrong"]},
)
Among all possible frequencies, only the monthly frequency has no errors. This is likely because the monthly frequency does not require updates to change the value of the next_payment_day column. It remains the same unless the user edits the schedule day. All frequencies that require updates to the next_payment_day have errors.
Considering that the longest possible period for an allowance is monthly, it is expected that no allowance has an updated_at date older than 30 days. However, there are some allowances with an updated_at date older than 30 days, as shown in the next graph. It is possible for an allowance to have an updated_at date older than 30 days if the user re-enables an allowance that was previously disabled. Unfortunately, I do not have information about the specific times when allowances were enabled or disabled to confirm this hypothesis.
# dataframe with all lines of the allowance_backend_table with the expected next_payment_day considering the CURRENT_DAY
all_backend_table_truth = (
    backend_table_daily_truth.unionByName(backend_table_monthly_truth)
    .unionByName(backend_table_biweekly_truth)
    .unionByName(backend_table_weekly_truth)
)
# Age of each row's last update relative to the analysis date.
all_backend_table_truth = all_backend_table_truth.withColumn(
    "how_old_was_last_update",
    spark_f.date_diff(
        spark_f.to_timestamp(spark_f.lit(CURRENT_DAY)), spark_f.col("updated_at")
    ),
)
histogram(
    all_backend_table_truth.toPandas(),
    "how_old_was_last_update",
    "Time in days from last update",
)
As observed in the following graphs, the distribution of errors for allowances that were edited is similar to those that were never edited. This indicates that the issue is not related to the edit action of an allowance.
# Split correctness by whether the allowance was ever edited.
to_graph = all_backend_table_truth.filter("last_event_name like '%created'")
histogram(to_graph.toPandas(), "correct", "Correctness of allowances never edited")
to_graph = all_backend_table_truth.filter("last_event_name like '%edited'")
histogram(
    to_graph.toPandas(),
    "correct",
    "Correctness of allowances with at least one edition",
)
In the daily frequency, 86.4% of the allowances have the next_payment_day equal to updated_at + 1 day. Considering this and the fact that almost every daily allowance has an incorrect next_payment_day when using the current date of 2024-12-03, it suggests that the backend job responsible for updating the next_payment_day column for the daily frequency is either not running every day or is encountering issues. This can be observed in the following graph.
# all_backend_table_truth.filter("correct = 'correct' and day = 'daily'").show()
# Re-evaluate the daily rule using each row's own updated_at as "today":
# had the job run on updated_at, next_payment_day should be that day + 1.
test_next_update_daily = all_backend_table_truth.withColumn(
    "next_day_from_update", next_payment_day_daily_frequency(None, None, "updated_at")
).filter(
    "frequency = 'daily'"
)  # .show()
test_next_update_daily = test_next_update_daily.withColumn(
    "update_correct",
    spark_f.when(
        spark_f.col("next_day_from_update") == spark_f.col("next_payment_day"),
        spark_f.lit("correct"),
    ).otherwise(spark_f.lit("wrong")),
)
# test_next_update.show()
pie_chart(
    test_next_update_daily.toPandas(),
    "update_correct",
    "Correct and wrong cases for the daily frequency using the updated_at as the current day",
    category_orders={"update_correct": ["correct", "wrong"]},
)
# all_backend_table_truth.filter("date(updated_at) = '2024-11-03'").show()
The majority of the discrepancies in the next_payment_day column, when compared to the next_day_from_update, show a difference of 1 day. This suggests that there may have been instances of double updating, where the update process increments the next_payment_day without considering the current day or some kind of parallel processing problem.
# For the wrong daily rows: how far is next_payment_day from updated_at + 1?
test_next_update_errors = (
    test_next_update_daily.filter("update_correct = 'wrong'")
    .withColumn(
        "next_day_to_next_day_from_update_dif",
        test_next_update_daily.next_payment_day
        - test_next_update_daily.next_day_from_update,
    )
    .select(
        "user_id",
        "frequency",
        "updated_at",
        "next_day_from_update",
        "next_payment_day",
        "next_day_to_next_day_from_update_dif",
    )
)
test_next_update_errors.show()
+--------------------+---------+--------------------+--------------------+----------------+------------------------------------+ | user_id|frequency| updated_at|next_day_from_update|next_payment_day|next_day_to_next_day_from_update_dif| +--------------------+---------+--------------------+--------------------+----------------+------------------------------------+ |e792e7d1-67ae-444...| daily| 2024-11-25 21:18:00| 26| 27| 1.0| |2cb3cd03-75f8-429...| daily| 2024-11-14 22:58:18| 15| 16| 1.0| |7f1f0ab3-4eb3-418...| daily|2024-10-06 21:44:...| 7| 8| 1.0| |ddcfef7f-f431-4d2...| daily|2024-09-11 10:51:...| 12| 4| -8.0| |337875a6-6c2b-49a...| daily| 2024-11-25 21:18:13| 26| 27| 1.0| |9be9e091-4f3a-4a6...| daily|2024-10-15 22:26:...| 16| 17| 1.0| +--------------------+---------+--------------------+--------------------+----------------+------------------------------------+
In the weekly frequency, almost all allowances have a correct next_payment_day if we consider the current day as the updated_at date. This is similar to what happened with the daily frequency. Therefore, it indicates that the job responsible for updating the next_payment_day is either not running daily or is encountering issues.
# Same check for the weekly frequency: recompute next_payment_day using
# each row's updated_at as "today".
test_next_update_weekly = all_backend_table_truth.filter(
    "frequency = 'weekly'"
).withColumn(
    "next_day_from_update", next_payment_day_weekly_frequency("day", None, "updated_at")
)
test_next_update_weekly = test_next_update_weekly.withColumn(
    "update_correct",
    spark_f.when(
        spark_f.col("next_day_from_update") == spark_f.col("next_payment_day"),
        "correct",
    ).otherwise("wrong"),
)
pie_chart(
    test_next_update_weekly.toPandas(),
    "update_correct",
    "Correct and wrong cases for the weekly frequency using the updated_at as the current day",
    category_orders={"update_correct": ["correct", "wrong"]},
)
The wrong cases for the weekly frequency when considering the updated_at as the current date:
# Detail of the weekly rows still wrong even with updated_at as the
# current day, ordered by the size of the discrepancy.
test_next_update_weekly.filter("update_correct = 'wrong'").withColumn(
    "next_day_to_next_day_from_update_dif",
    test_next_update_weekly.next_payment_day
    - test_next_update_weekly.next_day_from_update,
).select(
    "user_id",
    "frequency",
    "updated_at",
    "next_day_from_update",
    "next_payment_day",
    "next_day_to_next_day_from_update_dif",
).orderBy(
    "next_day_to_next_day_from_update_dif"
).show()
+--------------------+---------+--------------------+--------------------+----------------+------------------------------------+ | user_id|frequency| updated_at|next_day_from_update|next_payment_day|next_day_to_next_day_from_update_dif| +--------------------+---------+--------------------+--------------------+----------------+------------------------------------+ |6ca19a12-7b20-425...| weekly|2024-11-29 21:34:...| 30| 7| -23.0| |4ba547e8-2e7c-443...| weekly|2024-11-28 21:31:...| 29| 6| -23.0| |d1935063-ebf8-466...| weekly|2024-09-23 22:14:...| 24| 1| -23.0| |d9a09694-46e5-4c4...| weekly|2024-11-29 22:42:...| 30| 7| -23.0| |bc41c142-da12-43e...| weekly|2024-11-17 21:57:...| 22| 8| -14.0| |acdd1c63-bbc7-41e...| weekly|2024-11-23 18:31:...| 29| 15| -14.0| |f9c105e9-828d-467...| weekly|2024-11-17 21:57:...| 22| 8| -14.0| |30cc457a-c064-428...| weekly|2024-11-22 16:44:...| 29| 15| -14.0| |08868477-f5d2-4f3...| weekly|2024-11-01 09:52:...| 8| 1| -7.0| |17aa6e97-d51d-488...| weekly|2024-11-17 12:42:...| 24| 17| -7.0| |4d50e137-2881-4c2...| weekly|2024-11-22 22:24:...| 29| 22| -7.0| |c060b68f-a5ad-492...| weekly|2024-11-09 13:25:...| 10| 6| -4.0| |4720f736-3034-4b6...| weekly|2024-11-04 21:13:...| 8| 11| 3.0| |f1613d85-2abd-426...| weekly| 2024-12-01 23:45:33| 2| 9| 7.0| |33444b84-eb01-4ba...| weekly|2024-10-02 22:23:...| 3| 10| 7.0| |60dc27fa-6ba6-43d...| weekly|2024-10-15 22:45:...| 16| 23| 7.0| |f741b3eb-ce92-4ea...| weekly|2024-11-14 22:52:...| 15| 22| 7.0| |ef83e1ca-eadf-4d4...| weekly|2024-09-12 23:49:...| 13| 20| 7.0| |1649fee7-c424-4fb...| weekly|2024-10-12 22:27:...| 13| 22| 9.0| |a50295d7-5c2b-41b...| weekly|2024-12-01 23:14:...| 2| 25| 23.0| +--------------------+---------+--------------------+--------------------+----------------+------------------------------------+
Conclusion¶
There appears to be a scheduling issue as the update process is not occurring daily. Additionally, there are some anomalies that might be due to the job not being idempotent and running multiple times on certain days.